#===============================================================================
#  File:    Translations.R
#  Date:    December, 2021 
#  Author:  Natalia Umansky
#  Purpose: Identify and translate all non-English tweets
#===============================================================================

# LIBRARIES
#===============================================================================
library(rtweet)
library(tidyverse)
library(stringi)
library(qdapRegex)
library(googleLanguageR)
library(rtweet)
library(arsenal)
library(cld2)
library(data.table)

#Import data
#===============================================================================
Groups <- c("Politicians", "Media", "Citizens", "Advocates", "Friends")

# Read each group's tweets from ~/<group>.csv, keeping only the columns used
# downstream, and tag every row with the name of the group it was read from.
# Each table is also bound to a variable named after its group so the tables
# can be stacked explicitly below.
for (i in Groups) {
  df <- fread(
    paste0("~/", i, ".csv"),
    select = c("user_id", "status_id", "created_at", "screen_name", "text", "hashtags")
  )
  # Size the label by the number of rows read. The selected columns do not
  # include `fit`, so length(df$fit) is 0 and cannot be used to build a label
  # for every row.
  df$group <- rep(i, nrow(df))
  assign(i, df)
}

Tweets <- rbind(Media, Politicians, Citizens, Advocates, Friends)

# Drop repeated status_ids, keeping the first occurrence; the rbind order above
# therefore decides which group label a repeated tweet keeps.
Tweets <- Tweets[!duplicated(Tweets$status_id), ]

#Clean data
#===============================================================================

# Strip Twitter-specific noise from the tweet text before language detection.
# Order matters: mentions, URLs and the "&amp;" entity are removed while their
# punctuation is still present; the generic punctuation strip runs afterwards.
Tweets$text <- Tweets$text %>%
  # A regex match is required here: as a fixed (literal) string the mention
  # pattern never occurs in a tweet, so user names were left in the text.
  stri_replace_all_regex(pattern = "@\\w+", replacement = "") %>%
  rm_twitter_url() %>%
  # Remove only the HTML entity for "&"; a bare "amp" pattern would also eat
  # the middle of ordinary words such as "campaign" or "example".
  str_replace_all(fixed("&amp;"), "") %>%
  str_replace_all("@", "") %>%
  str_replace_all("#", "") %>%
  str_remove_all("[[:punct:]]") %>%
  # Removes every non-ASCII character, which covers emojis, so no separate
  # emoji step is needed.
  # NOTE(review): this also strips accented and non-Latin letters before the
  # language detection and translation steps -- confirm this is intended.
  str_replace_all("[^\x01-\x7F]", "") %>%
  str_replace_all("[[:digit:]]+", "")

Tweets <- filter(Tweets, !is.na(text))

#Detect language
#===============================================================================

text <- as.character(Tweets[["text"]])

# Store the detected language codes directly as a character column (one value
# per tweet). Wrapping the result in a data frame would nest a table inside the
# column and make the `language == "en"` comparisons below unreliable.
# NOTE(review): this pass uses plain_text = FALSE while the second detection
# pass further down uses TRUE -- confirm the difference is deliberate.
Tweets$language <- detect_language(text, plain_text = FALSE)

#Select non-English tweets
#===============================================================================

# Keep rows whose detected language is something other than English and whose
# cleaned text is not just a single space.
nonEng <- Tweets %>%
  filter(language != "en", text != " ")

#Select English tweets
#===============================================================================

# Rows already detected as English need no translation.
Eng <- filter(Tweets, language == "en")

#Translating non-English tweets
#===============================================================================
gl_auth("~/auth.json")

# gl_translate() returns a table with one row per input string; only its
# translatedText column is needed. The original text is overwritten in place,
# by name, so nonEng keeps exactly the same columns (and column order) as Eng.
# This avoids positional renames, which silently hit the wrong column whenever
# the column count changes. The guard skips the API call when there is nothing
# to translate.
if (nrow(nonEng) > 0) {
  translation <- gl_translate(nonEng$text, target = "en")
  nonEng$text <- as.character(translation$translatedText)
}

#Creating final dataframe
#===============================================================================

# Eng and nonEng share the same columns, so they can be stacked directly; the
# text column now holds English text for every row.
Tweets <- rbind(Eng, nonEng)

#Filter Tweets not identified as English
#===============================================================================

text <- as.character(Tweets[["text"]])

# Re-run detection on the combined (partly translated) text and store the codes
# as a plain character column.
Tweets$language <- detect_language(text, plain_text = TRUE)

# Keep only rows now detected as English, then drop the helper column by name.
# select() works the same on a data.table and a data.frame, whereas a logical
# vector in the column position of `[` is evaluated as an expression on a
# data.table rather than used to pick columns.
Tweets <- Tweets %>%
  filter(language == "en") %>%
  select(-language)

#Divide and save the translated Tweets
#===============================================================================

# Write one UTF-8 CSV per group, named <group>_translated.csv, containing only
# that group's rows.
for (i in Groups) {
  out_file <- paste0(i, "_translated.csv")
  df <- filter(Tweets, group == i)
  write.csv(df, file = out_file, fileEncoding = "UTF-8")
}


